Churn Prediction Model¶

In [31]:
#%pip install catboost
In [ ]:
 
In [8]:
# Libraries

import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import accuracy_score, classification_report

from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, ConfusionMatrixDisplay

from sklearn.calibration import calibration_curve

import plotly.express as px

Data Exploration and Cleaning¶

In [2]:
# Reading the file
# NOTE(review): numeric columns contain thousands separators (e.g. "48,406.80")
# and '-' placeholders, so they load as object dtype — converted further down.
df = pd.read_csv('TakeHomeData.csv')
df.head()
Out[2]:
year client months_as_client_in_2023 client_start_date client_budget client_spend impressions_received clicks_received calls_received emails_received web_events_received leads_received client_state bc_name churn_indicator
0 2023 1 12 1/5/2023 48,406.80 47,210.69 253,746,288 5,792,808 83,868 348 10,092 94,308 FL Dentists & Dental Services 0
1 2023 2 12 1/5/2023 1,400.00 1,473.72 5,545,008 229,936 4,788 - 4,032 8,820 NY Attorneys & Legal Services 0
2 2023 3 12 1/23/2023 400,896.00 321,417.48 479,778,784 19,068,544 2,007,264 12,064 470,496 2,489,824 NC Home & Home Improvement 0
3 2023 4 12 1/1/2023 290,286.40 231,168.99 303,924,144 11,370,874 863,078 6,352 185,002 1,054,432 TX Home & Home Improvement 0
4 2023 5 12 1/11/2023 13,070.75 12,559.39 16,054,082 1,330,064 158,110 - 38,024 196,134 FL Automotive -- Repair, Service & Parts 0
In [34]:
# Feature Types
# Most numeric-looking columns load as object (strings) because of the
# thousands separators / '-' placeholders in the raw CSV.
df.dtypes
Out[34]:
year                         int64
client                       int64
months_as_client_in_2023     int64
client_start_date           object
client_budget               object
client_spend                object
impressions_received        object
clicks_received             object
calls_received              object
emails_received             object
web_events_received         object
leads_received              object
client_state                object
bc_name                     object
churn_indicator              int64
dtype: object
In [35]:
# Dataset Size: (rows, columns)
df.shape
Out[35]:
(8000, 15)
In [ ]:
 
In [3]:
# Numeric Conversion
# These columns load as strings because the CSV uses thousands separators
# (e.g. "48,406.80") and '-' as a missing-value placeholder.

columns_to_convert = [
    'client_budget', 'client_spend', 'impressions_received',
    'clicks_received', 'calls_received', 'emails_received',
    'web_events_received', 'leads_received'
]

for col in columns_to_convert:
    # Guard on dtype so this cell is idempotent: re-running it after a first
    # conversion would otherwise raise, since .str is only valid on object columns.
    if df[col].dtype == object:
        # Strip thousands separators, then coerce; '-' placeholders become NaN.
        df[col] = pd.to_numeric(df[col].str.replace(',', '', regex=False), errors='coerce')
In [37]:
# Feature Types
df.dtypes
Out[37]:
year                          int64
client                        int64
months_as_client_in_2023      int64
client_start_date            object
client_budget               float64
client_spend                float64
impressions_received          int64
clicks_received               int64
calls_received              float64
emails_received             float64
web_events_received         float64
leads_received              float64
client_state                 object
bc_name                      object
churn_indicator               int64
dtype: object
In [38]:
# Summary statistics — note the heavy right skew (max far above the 75th percentile)
df.describe()
Out[38]:
year client months_as_client_in_2023 client_budget client_spend impressions_received clicks_received calls_received emails_received web_events_received leads_received churn_indicator
count 8000.0 8000.00000 8000.000000 8.000000e+03 8.000000e+03 8.000000e+03 8.000000e+03 7.647000e+03 3.122000e+03 7.646000e+03 7.854000e+03 8000.000000
mean 2023.0 4000.50000 9.789875 2.279044e+04 2.010503e+04 5.567114e+07 4.046339e+06 2.127724e+05 5.849077e+03 2.311937e+05 4.345605e+05 0.407125
std 0.0 2309.54541 3.321950 3.150488e+05 2.641288e+05 9.897457e+08 8.419519e+07 5.064632e+06 2.888547e+04 4.161257e+06 8.793591e+06 0.491329
min 2023.0 1.00000 1.000000 1.200000e+00 6.600000e-01 1.488000e+03 7.700000e+01 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 0.000000
25% 2023.0 2000.75000 8.000000 8.994000e+02 8.093425e+02 1.254452e+06 8.008100e+04 3.192000e+03 1.450000e+02 1.061000e+03 5.460250e+03 0.000000
50% 2023.0 4000.50000 12.000000 2.630250e+03 2.449515e+03 4.585024e+06 2.890400e+05 1.311500e+04 5.510000e+02 4.474000e+03 2.125100e+04 0.000000
75% 2023.0 6000.25000 12.000000 7.165650e+03 6.634185e+03 1.360352e+07 8.667595e+05 4.561800e+04 2.781000e+03 1.883225e+04 7.748875e+04 1.000000
max 2023.0 8000.00000 12.000000 2.351699e+07 1.928980e+07 6.094081e+10 5.035279e+09 3.739859e+08 1.045928e+06 3.215831e+08 6.956959e+08 1.000000
In [39]:
# Missing Values: absolute count of nulls per column
df.isnull().sum()
Out[39]:
year                           0
client                         0
months_as_client_in_2023       0
client_start_date              0
client_budget                  0
client_spend                   0
impressions_received           0
clicks_received                0
calls_received               353
emails_received             4878
web_events_received          354
leads_received               146
client_state                   6
bc_name                       22
churn_indicator                0
dtype: int64
In [40]:
# Percentage of Missing Values
# emails_received is ~61% missing; the other engagement metrics are <5% each.
df.isnull().mean()*100
Out[40]:
year                         0.0000
client                       0.0000
months_as_client_in_2023     0.0000
client_start_date            0.0000
client_budget                0.0000
client_spend                 0.0000
impressions_received         0.0000
clicks_received              0.0000
calls_received               4.4125
emails_received             60.9750
web_events_received          4.4250
leads_received               1.8250
client_state                 0.0750
bc_name                      0.2750
churn_indicator              0.0000
dtype: float64
In [4]:
# Drop rows missing engagement metrics (each column is <5% missing, so the loss
# is small). Reassignment instead of inplace=True — same effect here, but avoids
# the pandas inplace anti-pattern and keeps the step idempotent/chainable.
df = df.dropna(subset=['calls_received', 'web_events_received', 'leads_received'])
In [ ]:
 
In [5]:
# Bucket the handful of missing categoricals into an explicit 'Unknown'
# level instead of dropping those rows.
for cat_col in ('client_state', 'bc_name'):
    df[cat_col] = df[cat_col].fillna('Unknown')
In [6]:
# emails_received is ~61% missing — too sparse to impute reliably, so drop it.
# Reassignment instead of inplace=True avoids the pandas inplace anti-pattern.
df = df.drop(columns=['emails_received'])
In [44]:
# Sanity check: no missing values should remain after the cleaning steps above
df.isnull().mean()*100
Out[44]:
year                        0.0
client                      0.0
months_as_client_in_2023    0.0
client_start_date           0.0
client_budget               0.0
client_spend                0.0
impressions_received        0.0
clicks_received             0.0
calls_received              0.0
web_events_received         0.0
leads_received              0.0
client_state                0.0
bc_name                     0.0
churn_indicator             0.0
dtype: float64

EDA¶

In [45]:
# Churn by Campaign Performance: Do higher impressions, clicks, or leads protect against churn?

metrics = ["impressions_received", "clicks_received", "leads_received"]
cut_percentile = 95  # bottom row trims values above this percentile

plt.figure(figsize=(15, 10))


def _churn_boxplot(data, metric, position, title_suffix="", show_means=False, label_mean=False):
    """Draw one churn-vs-metric boxplot in subplot `position`; optionally mark group means in red."""
    plt.subplot(2, 3, position)
    sns.boxplot(data=data, x='churn_indicator', y=metric, palette='Set2')
    plt.title(f'{metric.replace("_", " ").capitalize()} by Churn{title_suffix}')
    plt.xlabel('Churn Indicator')
    plt.ylabel(metric.replace('_', ' ').capitalize())
    if show_means:
        # mean for each churn group, overlaid as a red dot
        group_means = data.groupby('churn_indicator')[metric].mean()
        for j, mean_val in enumerate(group_means):
            plt.scatter(j, mean_val, color='red', s=100,
                        label='Mean' if label_mean and j == 0 else "")


# Top row: raw distributions
for i, metric in enumerate(metrics, 1):
    _churn_boxplot(df, metric, i)

# Bottom row: same plots with values above the 95th percentile removed
for i, metric in enumerate(metrics, 1):
    cutoff_value = df[metric].quantile(cut_percentile / 100.0)
    filtered_df = df[df[metric] <= cutoff_value]
    _churn_boxplot(filtered_df, metric, i + 3, ' (Without Outliers)',
                   show_means=True, label_mean=(i == 1))

plt.tight_layout()
plt.show()

# the bulk of the data is concentrated at lower values for both groups
# Higher impressions, clicks, or leads alone might not strongly protect against churn
In [46]:
# Budget vs. Spend Efficiency by Churn: Are churned clients less efficient in using their budget?

# Ratio of actual spend to allocated budget
df['spend_efficiency'] = df['client_spend'] / df['client_budget']

plt.figure(figsize=(8, 6))
sns.boxplot(data=df, x='churn_indicator', y='spend_efficiency', palette='coolwarm')

# Overlay the mean of each churn group as a red dot
means_by_group = df.groupby('churn_indicator')['spend_efficiency'].mean()
for position, (_, mean_val) in enumerate(means_by_group.items()):
    plt.scatter(position, mean_val, color='red', s=100,
                label='Mean' if position == 0 else "")

plt.title('Spend Efficiency by Churn Indicator')
plt.xlabel('Churn Indicator')
plt.ylabel('Spend Efficiency (Spend / Budget)')
plt.legend(title="Annotations")

plt.show()


# churned clients seem to have a wider spread in their spend efficiency,
# with a larger number of outliers at both lower and higher efficiency levels
# indicating inconsistencies in budget management
In [ ]:
 
In [9]:
# State-wise Churn Rates: Do some states exhibit consistently higher churn rates?

# Mean churn indicator per state, sorted ascending for a readable ramp
churn_rates = df.groupby('client_state')['churn_indicator'].mean().sort_values()

fig = px.bar(
    x=churn_rates.index,
    y=churn_rates.values,
    title="Churn Rates by Client State",
    labels={'x': 'Client State', 'y': 'Churn Rate'},
    text=churn_rates.values,
)

# Cosmetics: two-decimal labels above bars, vertical state ticks, fixed canvas
fig.update_traces(texttemplate='%{text:.2f}', textposition='outside')
fig.update_layout(
    xaxis_title="Client State",
    yaxis_title="Churn Rate",
    xaxis=dict(tickangle=90),
    width=900,
    height=600,
)

fig.show()
In [48]:
# Campaign Performance and Lead Conversion: Does better lead conversion reduce churn risk?

# Leads produced per impression received
df['conversion_rate'] = df['leads_received'] / df['impressions_received']

# Outlier Cutoff: keep rows at or below the 95th percentile
cutoff_v = df['conversion_rate'].quantile(0.95)
filtered_df = df[df['conversion_rate'] <= cutoff_v]


def _conversion_boxplot(data, position, title):
    """One churn-vs-conversion-rate boxplot with red dots marking the group means."""
    plt.subplot(1, 2, position)
    sns.boxplot(data=data, x='churn_indicator', y='conversion_rate', palette='Set3')
    group_means = data.groupby('churn_indicator')['conversion_rate'].mean()
    for i, mean_val in enumerate(group_means):
        plt.scatter(i, mean_val, color='red', s=100, label='Mean' if i == 0 else "")
    plt.title(title)
    plt.xlabel('Churn Indicator')
    plt.ylabel('Conversion Rate (Leads / Impressions)')
    plt.legend(title="Annotations", loc='upper right')


plt.figure(figsize=(16, 6))

# Left: full data. Right: outliers above the 95th percentile removed.
_conversion_boxplot(df, 1, 'Conversion Rate by Churn Indicator (Original)')
_conversion_boxplot(filtered_df, 2, 'Conversion Rate by Churn Indicator (Without Outliers)')

plt.tight_layout()
plt.show()
# similar in its overall distribution
# low median conversion rate for both groups
In [ ]:
 
In [49]:
# Churn Indicator Distribution
# Shares of retained vs. churned clients in the cleaned dataset.

churn_distribution = df['churn_indicator'].value_counts(normalize=True)

plt.figure(figsize=(8, 8))
plt.pie(
    churn_distribution.values,
    labels=['Not Churned (0)', 'Churned (1)'],
    autopct='%1.1f%%',
    startangle=90,
    textprops={'fontsize': 14},
)
plt.title("Churn Indicator Distribution", fontsize=16)
plt.axis('equal')  # keep the pie circular
plt.show()

Feature Engineering¶

In [50]:
# Engagement to Impression Ratio: total engagement events per impression.
# Bug fix: the original column name contained a trailing space ('eng_to_imp_ratio ');
# nothing downstream references it by name, so renaming is safe.
df['eng_to_imp_ratio'] = (
    df['clicks_received'] + df['calls_received']
    + df['web_events_received'] + df['leads_received']
) / df['impressions_received']

Preprocessing¶

In [51]:
# Define features and target
TARGET = 'churn_indicator'
# Identifiers / constants / raw date are excluded from the feature matrix
NON_FEATURE_COLS = ['year', 'months_as_client_in_2023', 'client', TARGET, 'client_start_date']

X = df.drop(columns=NON_FEATURE_COLS)
y = df[TARGET]

# 80/20 hold-out split; fixed seed for reproducibility
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=45)

Model Training¶

In [52]:
model = CatBoostClassifier(
    iterations=500,          # max boosting rounds
    learning_rate=0.01,
    depth=6,                 # tree depth
    loss_function='Logloss', # binary classification objective
    eval_metric='AUC',       # metric reported on the eval set
    cat_features=['client_state', 'bc_name'],  # Categorical features
    verbose=100              # log every 100 iterations
)

# Train
# NOTE(review): the test set is used as the early-stopping eval_set, so the
# reported test AUC is optimistically biased — consider a separate validation
# split (or CV) and keep the test set untouched until final evaluation.
model.fit(x_train, y_train, eval_set=(x_test, y_test), early_stopping_rounds=50)
0:	test: 0.7888070	best: 0.7888070 (0)	total: 82.1ms	remaining: 41s
100:	test: 0.8023052	best: 0.8023052 (100)	total: 424ms	remaining: 1.68s
200:	test: 0.8075692	best: 0.8075992 (199)	total: 762ms	remaining: 1.13s
300:	test: 0.8110885	best: 0.8110885 (300)	total: 1.12s	remaining: 741ms
400:	test: 0.8135437	best: 0.8136095 (399)	total: 1.54s	remaining: 381ms
499:	test: 0.8155854	best: 0.8155854 (499)	total: 1.9s	remaining: 0us

bestTest = 0.8155853971
bestIteration = 499

Out[52]:
<catboost.core.CatBoostClassifier at 0x1479d6f50>

Model Evaluation¶

In [53]:
# Hold-out evaluation: hard predictions for accuracy / classification report,
# positive-class probabilities for ROC AUC.
y_pred = model.predict(x_test)
y_proba = model.predict_proba(x_test)[:, 1]

test_accuracy = accuracy_score(y_test, y_pred)
test_auc = roc_auc_score(y_test, y_proba)

print("Accuracy:", test_accuracy)
print("ROC AUC:", test_auc)
print("\nClassification Report:\n", classification_report(y_test, y_pred))


# Receiver Operating Characteristic - Area Under the Curve: Handles Imbalance Dataset
Accuracy: 0.7583892617449665
ROC AUC: 0.8155853970796416

Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.87      0.81       897
           1       0.75      0.60      0.66       593

    accuracy                           0.76      1490
   macro avg       0.76      0.73      0.74      1490
weighted avg       0.76      0.76      0.75      1490

In [54]:
def p_roc_curve(y_test, y_proba):
    """Plot the ROC curve for the given true labels and predicted probabilities."""
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    roc_auc = roc_auc_score(y_test, y_proba)
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(fpr, tpr, label=f'ROC Curve (AUC = {roc_auc:.2f})')
    # Diagonal = performance of a random classifier
    ax.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Random Guess')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver Operating Characteristic (ROC) Curve')
    ax.legend()
    ax.grid(True)
    plt.show()


def p_confusion_matrix(y_test, y_pred):
    """Display the churn confusion matrix with human-readable class labels."""
    matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
    matrix_display = ConfusionMatrixDisplay(confusion_matrix=matrix,
                                            display_labels=['Not Churned', 'Churned'])
    matrix_display.plot(cmap='Blues', values_format='d')
    plt.title("Confusion Matrix")
    plt.grid(False)
    plt.show()


# Render the evaluation plots defined above
p_roc_curve(y_test, y_proba)
p_confusion_matrix(y_test, y_pred)

# Open questions for the review discussion:
# ROC Curve vs AUC?
# Higher Precision or Higher Recall for Churn Prediction
In [55]:
# Rank features by CatBoost's importance scores
feature_names = x_train.columns
feature_imp = pd.DataFrame({
    'Feature': feature_names,
    'Importance': model.get_feature_importance(),
})
top_features = feature_imp.nlargest(10, 'Importance')
In [56]:
# Horizontal bar chart of the top 10 features
fig, ax = plt.subplots(figsize=(8, 6))
ax.barh(top_features['Feature'], top_features['Importance'], color='skyblue')
ax.set_xlabel('Importance Score', fontsize=14)
ax.set_ylabel('Feature', fontsize=14)
ax.set_title('Top 10 Feature Importances', fontsize=16)
ax.invert_yaxis()  # most important feature on top
ax.grid(axis='x', linestyle='--', alpha=0.7)
plt.show()
In [57]:
# Calibration Plot:
# Compares predicted churn probabilities against observed churn frequencies.

def calibration_plot(y_test, y_proba, n_bins=10):
    """Plot a reliability curve: fraction of positives vs. mean predicted probability."""
    prob_true, prob_pred = calibration_curve(y_test, y_proba, n_bins=n_bins, strategy='uniform')

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(prob_pred, prob_true, marker='o', label='Model')
    # Diagonal = perfectly calibrated probabilities
    ax.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Perfect Calibration')
    ax.set_title('Calibration Plot', fontsize=16)
    ax.set_xlabel('Mean Predicted Probability', fontsize=14)
    ax.set_ylabel('Fraction of Positives', fontsize=14)
    ax.legend(fontsize=12)
    ax.grid(True)
    plt.show()

calibration_plot(y_test, y_proba)

# helps in fine-tuning probability-based decisions (e.g. choosing a churn threshold)